// Let the script run start to finish without -more- pauses.
set more off

// Working directory: replace the placeholder with the replication folder.
// -capture- suppresses the error if the path does not exist, so the script
// then continues from whatever directory is already current.
capture cd "YOUR WORKING DIRECTORY HERE"

// Close a log left open by an earlier run; no error if none is open.
capture log close



********************************************************************************
*
* This is the Stata do-file that builds Appendix Figures S6 to S8 and Appendix Tables S-1 to S-4
*    
********************************************************************************

********************************
* Figures S6 to S8 start
***    

// ---------------------------------------------------------------------------
// Figure S6 data: one row per patent, restricted to subcategory 31 (sub_drug)
// ---------------------------------------------------------------------------

// Start from the patent-mesh file and discard the tag variables it ships with
use "clean_data/patent_data/patent_gbd_level.dta", clear
drop *tag

// Flag a single row per patent_id (never a row whose patent_id is missing)
// and keep only the flagged rows
egen patent_tag = tag(patent_id)
replace patent_tag = 0 if missing(patent_id)
keep if patent_tag

// Subcategory indicators
generate sub_drug          = subcategory_id == 31
generate sub_surgery_inst  = subcategory_id == 32
generate sub_biotech       = subcategory_id == 33
generate sub_miscellaneous = subcategory_id == 39

// Exclusive focus flags and their difference (+1 female only, -1 male only)
generate patent_only_female = patent_female & !patent_male
generate patent_only_male   = patent_male & !patent_female
generate patent_net_fm      = patent_only_female - patent_only_male

// Attach inventor gender counts; rows found only in the using file are
// discarded and no _merge variable is left behind
merge 1:1 patent_id using "clean_data/patent_data/inventor_gender_counts.dta", ///
	keep(master match) nogenerate

generate all_female_team = female_count == pat_team_sz
generate all_female      = female_count == inventor_gender_count

keep if sub_drug == 1

// The majority flag needs both counts
drop if female_count == . | male_count == .

// Duplicate every row so the plots can show three groups through by():
// 0 = male majority, 1 = female majority, 2 = all patents (the duplicates)
expand 2, generate(expander)
replace pat_female_member = 2 if expander == 1

// Majority female? (a tie counts as female majority because of >=)
generate pat_majority_female = female_count >= male_count
replace pat_majority_female = 2 if expander == 1

// Graph style
grstyle init plain, replace
grstyle set plain, horizontal nogrid

// Percent-scale copies of the plotted variables. On the duplicated rows
// pat_majority_female100 equals 200; panel A leaves those rows out.
foreach v in pat_majority_female patent_female patent_male patent_net_fm {
	generate `v'100 = `v' * 100
}


**************   Figure S6   ****************
// Four yearly-trend panels for the drug subsample. Each panel is saved as a
// temporary .gph file; the four files are then combined into a single row.

// Panel A: percent of female-majority teams per year. The duplicated
// "all patents" rows (pat_majority_female == 2) are excluded.
binscatter pat_majority_female100 patent_year if pat_majority_female != 2, ///
	discrete ///
	line(connect) msymbols(none none none) lcolor(black) ///
	legend(off) ///
	xlabel(1980(10)2010) ///
	ylabel(05(05)30, gmin gmax labsize(small)) ///
	yline(0, lcolor(gray) lpattern(dash)) ///
	xtitle("") ytitle("") ///
	title("Percent Female Majority Team" "A", size(normal))
graph save "figures/all1", replace

// Panel B: percent female-focused patents, one line per team group.
// This is the only panel that carries the legend.
binscatter patent_female100 patent_year, ///
	by(pat_majority_female) discrete ///
	line(connect) msymbols(none none none) lcolor(gray red black) ///
	xlabel(1980(10)2010) ///
	ylabel(02(01)18, gmax gmin labsize(small)) ///
	ymtick(02(02)18, gmax gmin) ///
	xtitle("") ytitle("") ///
	title("Percent Female Focused" "B", size(normal)) ///
	legend( ///
		order(3 2 1) cols(1) ///
		lab(1 "Male Majority Team") ///
		lab(2 "Female Majority Team") ///
		lab(3 "All Patents") ///
		symxsize(5) size(4) rowgap(0.25) ///
		ring(0) bplacement(nwest) ///
		region(lstyle(none) color(none) margin(small)) ///
	)
graph save "figures/all2", replace

// Panel C: percent male-focused patents, same groups and axes as panel B
binscatter patent_male100 patent_year, ///
	by(pat_majority_female) discrete ///
	line(connect) msymbols(none none none) lcolor(gray red black) ///
	legend(off) ///
	xlabel(1980(10)2010) ///
	ylabel(02(01)18, gmax gmin labsize(small)) ///
	ymtick(02(02)18, gmax gmin) ///
	xtitle("") ytitle("") l2("") ///
	title("Percent Male Focused" "C", size(normal))
graph save "figures/all3", replace

// Panel D: net focus (female-only minus male-only), with a zero reference line
binscatter patent_net_fm100 patent_year, ///
	by(pat_majority_female) discrete ///
	line(connect) msymbols(none none none) lcolor(gray red black) ///
	legend(off) ///
	xlabel(1980(10)2010) ///
	ylabel(-05(01)05, gmin gmax labsize(small)) ///
	ymtick(-05(01)05, gmax gmin) ///
	yline(0, lcolor(gray) lpattern(dash)) ///
	xtitle("") ytitle("") ///
	title("Net Focus: Percent Female - Male" "D", size(normal))
graph save "figures/all4", replace

// Assemble the four panels side by side and export the figure
graph combine "figures/all1" "figures/all2" "figures/all3" "figures/all4", ///
	row(1) xsize(9) ysize(3)
graph export "figures/all_trends_drugs.pdf", as(pdf) replace










// ---------------------------------------------------------------------------
// Figure S7 data: one row per patent, restricted to subcategory 32
// (sub_surgery_inst)
// ---------------------------------------------------------------------------

// Reload the patent-mesh file and discard the tag variables it ships with
use "clean_data/patent_data/patent_gbd_level.dta", clear
drop *tag

// One flagged row per patent_id; rows with a missing patent_id are not flagged
egen patent_tag = tag(patent_id)
replace patent_tag = 0 if missing(patent_id)
keep if patent_tag

// Subcategory indicators
generate sub_drug          = subcategory_id == 31
generate sub_surgery_inst  = subcategory_id == 32
generate sub_biotech       = subcategory_id == 33
generate sub_miscellaneous = subcategory_id == 39

// Exclusive focus flags and their difference (+1 female only, -1 male only)
generate patent_only_female = patent_female & !patent_male
generate patent_only_male   = patent_male & !patent_female
generate patent_net_fm      = patent_only_female - patent_only_male

// Attach inventor gender counts; using-only rows are discarded and no
// _merge variable is kept
merge 1:1 patent_id using "clean_data/patent_data/inventor_gender_counts.dta", ///
	keep(master match) nogenerate

generate all_female_team = female_count == pat_team_sz
generate all_female      = female_count == inventor_gender_count

keep if sub_surgery_inst == 1

// The majority flag needs both counts
drop if female_count == . | male_count == .

// Duplicate every row so by() can plot three groups:
// 0 = male majority, 1 = female majority, 2 = all patents (the duplicates)
expand 2, generate(expander)
replace pat_female_member = 2 if expander == 1

// Majority female? (a tie counts as female majority because of >=)
generate pat_majority_female = female_count >= male_count
replace pat_majority_female = 2 if expander == 1

// Graph style
grstyle init plain, replace
grstyle set plain, horizontal nogrid

// Percent-scale copies of the plotted variables. On the duplicated rows
// pat_majority_female100 equals 200; panel A leaves those rows out.
foreach v in pat_majority_female patent_female patent_male patent_net_fm {
	generate `v'100 = `v' * 100
}


**************   Figure S7   ****************
// Four yearly-trend panels for the surgery/instruments subsample. The panels
// reuse the temporary files figures/all1-all4 and are combined into one row.

// Panel A: percent of female-majority teams per year. The duplicated
// "all patents" rows (pat_majority_female == 2) are excluded.
binscatter pat_majority_female100 patent_year if pat_majority_female != 2, ///
	discrete ///
	line(connect) msymbols(none none none) lcolor(black) ///
	legend(off) ///
	xlabel(1980(10)2010) ///
	ylabel(05(05)30, gmin gmax labsize(small)) ///
	yline(0, lcolor(gray) lpattern(dash)) ///
	xtitle("") ytitle("") ///
	title("Percent Female Majority Team" "A", size(normal))
graph save "figures/all1", replace

// Panel B: percent female-focused patents, one line per team group.
// This is the only panel that carries the legend.
binscatter patent_female100 patent_year, ///
	by(pat_majority_female) discrete ///
	line(connect) msymbols(none none none) lcolor(gray red black) ///
	xlabel(1980(10)2010) ///
	ylabel(02(02)32, gmax gmin labsize(small)) ///
	ymtick(02(02)32, gmax gmin) ///
	xtitle("") ytitle("") ///
	title("Percent Female Focused" "B", size(normal)) ///
	legend( ///
		order(3 2 1) cols(1) ///
		lab(1 "Male Majority Team") ///
		lab(2 "Female Majority Team") ///
		lab(3 "All Patents") ///
		symxsize(5) size(4) rowgap(0.25) ///
		ring(0) bplacement(neast) ///
		region(lstyle(none) color(none) margin(small)) ///
	)
graph save "figures/all2", replace

// Panel C: percent male-focused patents, same groups and axes as panel B
binscatter patent_male100 patent_year, ///
	by(pat_majority_female) discrete ///
	line(connect) msymbols(none none none) lcolor(gray red black) ///
	legend(off) ///
	xlabel(1980(10)2010) ///
	ylabel(02(02)32, gmax gmin labsize(small)) ///
	ymtick(02(02)32, gmax gmin) ///
	xtitle("") ytitle("") l2("") ///
	title("Percent Male Focused" "C", size(normal))
graph save "figures/all3", replace

// Panel D: net focus (female-only minus male-only), with a zero reference line
binscatter patent_net_fm100 patent_year, ///
	by(pat_majority_female) discrete ///
	line(connect) msymbols(none none none) lcolor(gray red black) ///
	legend(off) ///
	xlabel(1980(10)2010) ///
	ylabel(-05(05)25, gmin gmax labsize(small)) ///
	ymtick(-05(05)25, gmax gmin) ///
	yline(0, lcolor(gray) lpattern(dash)) ///
	xtitle("") ytitle("") ///
	title("Net Focus: Percent Female - Male" "D", size(normal))
graph save "figures/all4", replace

// Assemble the four panels side by side and export the figure
graph combine "figures/all1" "figures/all2" "figures/all3" "figures/all4", ///
	row(1) xsize(9) ysize(3)
graph export "figures/all_trends_surgery.pdf", as(pdf) replace


// ---------------------------------------------------------------------------
// Figure S8 data: one row per patent (all subcategories), each labelled with
// its team type
// ---------------------------------------------------------------------------

// Reload the patent-mesh file and discard the tag variables it ships with
use "clean_data/patent_data/patent_gbd_level.dta", clear
drop *tag

// One flagged row per patent_id; rows with a missing patent_id are not flagged
egen patent_tag = tag(patent_id)
replace patent_tag = 0 if missing(patent_id)
keep if patent_tag

// Subcategory indicators
generate sub_drug          = subcategory_id == 31
generate sub_surgery_inst  = subcategory_id == 32
generate sub_biotech       = subcategory_id == 33
generate sub_miscellaneous = subcategory_id == 39

// Exclusive focus flags and their difference (+1 female only, -1 male only)
generate patent_only_female = patent_female & !patent_male
generate patent_only_male   = patent_male & !patent_female
generate patent_net_fm      = patent_only_female - patent_only_male

// Attach inventor gender counts; using-only rows are discarded and no
// _merge variable is kept
merge 1:1 patent_id using "clean_data/patent_data/inventor_gender_counts.dta", ///
	keep(master match) nogenerate

generate all_female_team = female_count == pat_team_sz
generate all_female      = female_count == inventor_gender_count

// The majority flag needs both counts
drop if female_count == . | male_count == .

// Majority female? (a tie counts as female majority because of >=)
generate pat_majority_female = female_count >= male_count

// String suffix that becomes the j() identifier when the collapsed data are
// reshaped wide
generate team = cond(pat_majority_female, "_women", "_men")

// Collapse to one row per team type and year:
//   pats   = number of patents with a non-missing patent_female
//   f_pats = number of those flagged patent_female
collapse (count) pats = patent_female (sum) f_pats = patent_female, ///
	by(team patent_year)

// Share of female-focused patents within each team type and year
generate per_f_pats = f_pats / pats

// One row per year with separate *_men and *_women columns
reshape wide per_f_pats f_pats pats, i(patent_year) j(team) string
sort patent_year

// Total patents per year across both team types
generate pats = pats_women + pats_men

// Female-focused patents implied by the observed team mix
generate f_count_actual = pats_women*per_f_pats_women + pats_men*per_f_pats_men

// Counterfactual: each team type receives half of the year's patents and
// keeps its own female-focused share
generate f_count_equal = (pats/2)*(per_f_pats_men) + ///
					(pats/2)*(per_f_pats_women)

// Counterfactual minus actual
generate delta_count = f_count_equal - f_count_actual


**************   Figure S8   ****************

// Yearly gap between the equal-split counterfactual and the actual number of
// female-focused patents, drawn as a line without markers
twoway connected delta_count patent_year, ///
	lcolor(black) msymbol(none) ///
	xlabel(1980(10)2010) ///
	ylabel(0(50)250) ///
	xtitle("Year") ///
	ytitle("Number of patents") ///
	title("Estimated number of lost female-focused inventions per year", size(small))
graph export "figures/lost_inventions.pdf", as(pdf) replace

// Sum of the yearly gaps over all years. The total is constant across rows,
// so the tabulation shows that single value.
egen tot_female_lost = total(delta_count)
tabulate tot_female_lost


***    
* Figures S6 to S8 end
********************************


********************************************************************************
*
* This corresponds to the back of the envelope calculations in the appendix and discussed in the paper. Specifically, Appendix Section S-13
*    
********************************************************************************


// Each calculation below displays (counterfactual total) - (observed total).
// A total is N * sum over four team types of share * (increment + baseline);
// the last team type has no increment.
// NOTE(review): presumably N is the number of documents, the baseline is the
// rate for the reference team type and the increments are regression
// estimates -- confirm against Appendix Section S-13.

// Patents, no matching. Broken out assuming teams of size 3 (median) for the
// distributions. Counterfactual shares are 0.25 for every team type; the
// second sum uses shares 0.15, 0.09, 0.04 and 0.72. Both sets sum to 1.
// NOTE(review): this header says team size 3 and the next one says size 2,
// yet both calculations use identical shares -- confirm which is intended.
di  (441504*(0.25)*(0.013+0.13) + ///
     441504*(0.25)*(0.024+0.13) + ///
     441504*(0.25)*(0.046+0.13) + ///
     441504*(0.25)*(0.13)) - ///
	///
    (441504*(0.15)*(0.013+0.13) + ///
     441504*(0.09)*(0.024+0.13) + ///
     441504*(0.04)*(0.046+0.13) + ///
     441504*(0.72)*(0.13))
	

// FEs: Broken out assuming teams of size 2 (median) for distributions, matching FEs.
// Same shares and baseline as the calculation above; only the increments differ.
di  (441504*(0.25)*(0.0067+0.13) + ///
     441504*(0.25)*(0.011+0.13) + ///
     441504*(0.25)*(0.028+0.13) + ///
     441504*(0.25)*(0.13)) - ///
	///
    (441504*(0.15)*(0.0067+0.13) + ///
     441504*(0.09)*(0.011+0.13) + ///
     441504*(0.04)*(0.028+0.13) + ///
     441504*(0.72)*(0.13))

	  
// PubMed,
// Median is team of size 5 so assume min/maj/all based off 50/50 male/female
// teams of size 4
// NOTE(review): the counterfactual shares used here add up to 1.03125
// (2*0.484375 + 2*0.03125), whereas the second sum's shares
// (0.38, 0.23, 0.05, 0.34) add up to 1. If the intent is a 50/50 binomial
// over teams of size 5, the two middle shares would be 15/32 = 0.46875 --
// confirm before reusing these numbers. The comment above also mentions both
// size 5 and size 4 -- confirm which applies.
di  (2062695*(0.484375)*(0.0084+0.37) + ///
     2062695*(0.484375)*(0.03+0.37) + ///
     2062695*(0.03125)*(0.045+0.37) + ///
     2062695*(0.03125)*(0.37)) - ///
	///
    (2062695*(0.38)*(0.0084+0.37) + ///
     2062695*(0.23)*(0.03+0.37) + ///
     2062695*(0.05)*(0.045+0.37) + ///
     2062695*(0.34)*(0.37))
	 
	 
// Same N, shares and baseline as the PubMed calculation above, with a
// different set of increments (0.0074, 0.024, 0.041).
// NOTE(review): presumably the fixed-effects counterpart of the PubMed
// calculation -- confirm. The same share-sum caveat (1.03125) applies.
di  (2062695*(0.484375)*(0.0074+0.37) + ///
     2062695*(0.484375)*(0.024+0.37) + ///
     2062695*(0.03125)*(0.041+0.37) + ///
     2062695*(0.03125)*(0.37)) - ///
	///
    (2062695*(0.38)*(0.0074+0.37) + ///
     2062695*(0.23)*(0.024+0.37) + ///
     2062695*(0.05)*(0.041+0.37) + ///
     2062695*(0.34)*(0.37))
	 

********************************************************************************
*
* Tables S-1 to S-4 in the Appendix
*    
********************************************************************************
// Let this section run without -more- pauses when executed on its own.
set more off

// Working directory: replace the placeholder with the replication folder.
// -capture- suppresses the error if the path does not exist.
capture cd "YOUR WORKING DIRECTORY HERE"

// Build a temporary lookup file: the patent list joined to the
// indexing-sample weights and strata.
insheet using "clean_data/patent_data/patents.csv", clear

// Convert patent_id to string before merging on it
// (presumably the key is stored as a string in the .dta file -- confirm)
tostring patent_id, replace force

// Keep all rows from both files; no _merge variable is left behind
merge 1:1 patent_id using "clean_data/patent_data/indexing_sample_weights_and_strata.dta", ///
	nogenerate

sort id
save "clean_data/patent_data/temp_for_indexer_analysis.dta", replace




// Load the indexer dump. Its patent_id column is renamed to id so it can be
// merged on id with the lookup file, which supplies patent_id and the
// sampling variables.
insheet using "clean_data/patent_data/dump_sep17.csv", clear
rename patent_id id
merge m:1 id using "clean_data/patent_data/temp_for_indexer_analysis.dta"

// Keep matched rows only, and leave out the rows entered by user3
keep if _merge == 3 & username != "user3"

// Remove exact duplicates, then rows that repeat the same indexing decision
duplicates drop
duplicates drop id username mesh_term mesh_tree_num rank is_preassigned is_deleted, force

// Term-level flags
generate deleted     = is_deleted == "t"
generate mesh_male   = mesh_term == "MALE"
generate mesh_female = mesh_term == "FEMALE"

// MALE / FEMALE terms that were not deleted
generate human_mesh_male   = mesh_male & !deleted
generate human_mesh_female = mesh_female & !deleted

// Per patent: number of rows carrying a retained MALE / FEMALE term
bysort patent_id: egen human_male   = total(human_mesh_male)
bysort patent_id: egen human_female = total(human_mesh_female)

// Errors on terms look pretty good
binscatter deleted rank

// Patent-level variables carried into the survey analysis
local patvars patent_id human_male human_female strata strata_size sample_size ///
	pat_weight early pat_majority_female subcategory_id patent_female patent_male

// Reduce to one row per patent and username, then keep the patents that
// have exactly two usernames
keep `patvars' username
duplicates drop

bysort patent_id: egen user_count = count(username)
keep if user_count == 2

// Back to one row per patent
drop username user_count
duplicates drop

// Indicator versions of the counts: at least one retained term (…1) and
// more than one retained term (…2)
generate human_male1   = human_male > 0
generate human_female1 = human_female > 0

generate human_male2   = human_male > 1
generate human_female2 = human_female > 1


// Declare the survey design: probability weights pat_weight, stratification
// on strata, and a finite-population correction taken from strata_size.
svyset [pweight=pat_weight], fpc(strata_size) strata(strata)

// Design summary and weighted means of the MTI flags (patent_*) and the
// indexer flags (human_*2)
svydescribe
svy: mean patent_female
svy: mean patent_male

svy: mean human_female2
svy: mean human_male2


// Disagreement flags, taking the indexer flag as the reference:
//   fp_* : MTI flag set,     indexer flag not set
//   fn_* : MTI flag not set, indexer flag set
gen fp_female = (human_female2 == 0) & (patent_female == 1)
gen fn_female = (human_female2 == 1) & (patent_female == 0)
gen error_female = fp_female | fn_female

gen fp_male = (human_male2 == 0) & (patent_male == 1)
gen fn_male = (human_male2 == 1) & (patent_male == 0)
gen error_male = fp_male | fn_male

// Grouping indicators. The full variable name subcategory_id is spelled out:
// the abbreviation "subcategory" only resolves while variable abbreviation
// is enabled and no other variable shares that prefix.
gen surgery = subcategory_id == 32
gen post96 = !early
gen male_team = !pat_majority_female


// Variable labels shown in the validation tables
label var human_female2 "Female (Indexer)"
label var patent_female "Female (MTI)"

label var human_male2 "Male (Indexer)"
label var patent_male "Male (MTI)"


// Survey-weighted two-way tables of the indexer flag (rows) against the MTI
// flag (columns), with column proportions and confidence intervals.
//
// Tables S-3 and S-4 restrict to a team type through subpop() instead of an
// -if- qualifier. Stata's survey documentation warns that -if- can give
// incorrect variance estimates for subpopulations. Point estimates are the
// same either way; standard errors and confidence intervals can differ from
// the -if- version when the subpopulation cuts across strata.

**************   S-1   ****************

// Confusion Matrix and precision/recall for the female MeSH term
// \label{t:female-validation}
svy: tab human_female2 patent_female, ci col

**************   S-2   ****************

// Confusion Matrix and precision/recall for the male MeSH term
// \label{t:male-validation}
svy: tab human_male2 patent_male, ci col

**************   S-3   ****************

// Precision/recall for the female MeSH term by gender of the invention team
// \label{t:female-validation-gender}
svy, subpop(if pat_majority_female): tab human_female2 patent_female, ci col
svy, subpop(if !pat_majority_female): tab human_female2 patent_female, ci col

**************   S-4   ****************

// Precision/recall for the male MeSH term by gender of the invention team
// \label{t:male-validation-gender}
svy, subpop(if pat_majority_female): tab human_male2 patent_male, ci col
svy, subpop(if !pat_majority_female): tab human_male2 patent_male, ci col
